
"""
Processing of ZINC dataset.
The ZINC database is a curated collection of commercially available chemical compounds prepared especially for virtual screening. ZINC15 is designed to bring together biology and chemoinformatics with a tool that is easy to use for nonexperts, while remaining fully programmable for chemoinformaticians and computational biologists.
"""

import os
from os.path import join
import pandas as pd

from helixwrapper.datasets.inmemory_dataset import InMemoryDataset

__all__ = ['load_zinc_dataset']


def load_zinc_dataset(data_path):
    """Build an in-memory dataset of SMILES strings from the ZINC csv file.

    Description:

        The raw data is a gzip-compressed csv table read by
        ``_load_zinc_dataset``. Only one column of that table is read here:

            smiles:  SMILES representation of the molecular structure.

        Every SMILES string becomes one record of the form
        ``{'smiles': <string>}``.

    Args:
        data_path(str): the dataset root directory; the csv file is looked up
            inside its ``raw`` sub-directory.

    Returns:
        an InMemoryDataset instance wrapping the list of records.

    Example:
        .. code-block:: python
            dataset = load_zinc_dataset('./zinc')
            print(len(dataset))

    References:

    [1]Teague Sterling and John J. Irwin. Zinc 15 – ligand discovery for everyone. Journal of Chemical Information and Modeling, 55(11):2324–2337, 2015. doi: 10.1021/acs.jcim.5b00559. PMID: 26479676.
    """
    # One single-key dict per molecule, in the same order as the csv rows.
    records = [{'smiles': smiles} for smiles in _load_zinc_dataset(data_path)]
    return InMemoryDataset(records)


def _load_zinc_dataset(data_path):
    """Read the ``smiles`` column from the raw ZINC csv file.

    Args:
        data_path(str): the dataset root directory. It must contain a ``raw``
            sub-directory holding a gzip-compressed, comma-separated csv file
            with a ``smiles`` column.

    Returns:
        smiles_list: the values of the ``smiles`` column, in row order.

    Raises:
        FileNotFoundError: if the ``raw`` directory does not exist or
            contains no entries.
    """
    raw_path = join(data_path, 'raw')
    # os.listdir() returns entries in arbitrary order, so sort them to make
    # the choice of file deterministic when the directory has several entries.
    file_names = sorted(os.listdir(raw_path))
    if not file_names:
        raise FileNotFoundError(
            'No raw data file found in directory: %s' % raw_path)
    # Every column is read as a string so values are not coerced to numbers.
    input_df = pd.read_csv(
        join(raw_path, file_names[0]),
        sep=',',
        compression='gzip',
        dtype='str')
    smiles_list = list(input_df['smiles'])
    return smiles_list